started: AL26Apr2019
last updated: AL01Oct2019
Input sequencing data: 3,675 vars x 515 samples (258UBC + 257CBC)
Input eigenvectors for 515 samples (calculated using 468 non-rare variants not in LD)
Add eigenvectors to phenotypes, make PCA plots and
Mark 33 PC outliers: outside of mean +/- 3*sd on the top 2 eigenvectors
Sys.time()
## [1] "2019-10-01 20:12:10 BST"
rm(list=ls())
graphics.off()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s13_ampliseq_only_PCA/s03_explore_PCA_plots_exclude_outliers"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F)
options(warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588
# Threshold for outlier detection
th <- 3 # mean +/- (th*sd)
# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s01_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s13_ampliseq_only_PCA/s03_explore_PCA_plots_exclude_outliers"
# Eigenvectors & eigenvalues
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s13_ampliseq_only_PCA/s02_calculate_ampliseq_only_PCs/data/s04_pca"
eigenvectors_file <- paste(source_folder, "ampliseq_only_468_515_100PCs.eigenvec", sep="/")
eigenvalues_file <- paste(source_folder, "ampliseq_only_468_515_100PCs.eigenval", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")
# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file)
# List of objects
ls()
## [1] "base_folder" "eigenvalues.df" "eigenvectors.df" "genotypes.mx" "phenotypes.df" "th" "variants.df"
# Dimentions of objects
dim(eigenvectors.df)
## [1] 515 102
dim(eigenvalues.df)
## [1] 100 1
dim(genotypes.mx)
## [1] 3675 515
dim(phenotypes.df)
## [1] 515 24
dim(variants.df)
## [1] 3675 65
# Counts of nfe/controls/cases
table(phenotypes.df$cc)
##
## 0 1
## 258 257
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"long_ids" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
## long_ids PC1 PC2 PC3 PC4
## 100_S8_L007 100_S8_L007 -0.01912180 -0.02175980 0.0322586 -0.0105334
## 101_S528_L008 101_S528_L008 -0.01236830 -0.00464628 0.0165334 -0.0237710
## 102_S466_L008 102_S466_L008 -0.00735614 0.00498924 -0.0278440 0.0358669
## 103_S147_L007 103_S147_L007 0.08065560 0.08635970 0.0940607 0.0189153
## 104_S325_L008 104_S325_L008 0.01149850 -0.00584936 0.0375899 0.0245731
plot(eigenvalues.df$V1, type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,20),main="Top 100 eigenvectors")
plot(eigenvalues.df$V1[1:20], type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,20), main="Top 20 eigenvectors")
rm(eigenvalues.df)
# Add factor column with verbal lables for phenotypes to be used in plot
group <- factor(phenotypes.df$cc, levels = c(0,1), labels = c("UBC","CBC"))
phenotypes.df <- data.frame(phenotypes.df,group)
table(phenotypes.df$group)
##
## UBC CBC
## 258 257
# Add 5 top eigenvectors to phenotypes
phenotypes.df <- full_join(phenotypes.df,eigenvectors.df[,1:6], by="long_ids")
rownames(phenotypes.df) <- phenotypes.df$long_ids
# Prepare colour scale for ggplots
myColours <- c("UBC"="blue", "CBC"="red")
myColourScale <- scale_colour_manual(values=myColours)
# Clean-up
rm(group, eigenvectors.df, myColours)
pc1_mean <- mean(phenotypes.df$PC1)
pc1_sd <- sd(phenotypes.df$PC1)
pc1_min <- pc1_mean - pc1_sd * th
pc1_max <- pc1_mean + pc1_sd * th
pc2_mean <- mean(phenotypes.df$PC2)
pc2_sd <- sd(phenotypes.df$PC2)
pc2_min <- pc2_mean - pc2_sd * th
pc2_max <- pc2_mean + pc2_sd * th
pc3_mean <- mean(phenotypes.df$PC3)
pc3_sd <- sd(phenotypes.df$PC3)
pc3_min <- pc3_mean - pc3_sd * th
pc3_max <- pc3_mean + pc3_sd * th
pc4_mean <- mean(phenotypes.df$PC4)
pc4_sd <- sd(phenotypes.df$PC4)
pc4_min <- pc4_mean - pc4_sd * th
pc4_max <- pc4_mean + pc4_sd * th
pc5_mean <- mean(phenotypes.df$PC5)
pc5_sd <- sd(phenotypes.df$PC5)
pc5_min <- pc5_mean - pc5_sd * th
pc5_max <- pc5_mean + pc5_sd * th
rm(pc1_mean, pc1_sd,
pc2_mean, pc2_sd,
pc3_mean, pc3_sd,
pc4_mean, pc4_sd,
pc5_mean, pc5_sd,
th)
# Detect outliers on each PC
pc1_outlier <- phenotypes.df$PC1 < pc1_min | phenotypes.df$PC1 > pc1_max
pc2_outlier <- phenotypes.df$PC2 < pc2_min | phenotypes.df$PC2 > pc2_max
pc3_outlier <- phenotypes.df$PC3 < pc3_min | phenotypes.df$PC3 > pc3_max
pc4_outlier <- phenotypes.df$PC4 < pc4_min | phenotypes.df$PC4 > pc4_max
pc5_outlier <- phenotypes.df$PC5 < pc5_min | phenotypes.df$PC5 > pc5_max
# Detect outliers on any PC
pc_outlier <- pc1_outlier | pc2_outlier
#pc_outlier <- pc1_outlier | pc2_outlier | pc3_outlier | pc4_outlier | pc5_outlier
# Check counts of outliers
sum(pc1_outlier)
## [1] 20
sum(pc2_outlier)
## [1] 14
sum(pc3_outlier)
## [1] 3
sum(pc4_outlier)
## [1] 3
sum(pc5_outlier)
## [1] 2
sum(pc_outlier)
## [1] 33
# Add outliers data to phenotypes dataframe
phenotypes.df <- data.frame(phenotypes.df, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier)
phenotypes.df %>%
filter(pc_outlier) %>%
select(Sample_num, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier) %>%
arrange(as.integer(Sample_num))
## Sample_num pc_outlier pc1_outlier pc2_outlier pc3_outlier pc4_outlier pc5_outlier
## 1 3 TRUE TRUE FALSE FALSE FALSE FALSE
## 2 16 TRUE TRUE TRUE FALSE FALSE FALSE
## 3 141 TRUE TRUE FALSE FALSE FALSE FALSE
## 4 148 TRUE TRUE FALSE FALSE FALSE FALSE
## 5 235 TRUE FALSE TRUE FALSE FALSE FALSE
## 6 238 TRUE FALSE TRUE FALSE FALSE FALSE
## 7 246 TRUE TRUE FALSE FALSE FALSE FALSE
## 8 256 TRUE FALSE TRUE FALSE FALSE FALSE
## 9 267 TRUE FALSE TRUE FALSE FALSE FALSE
## 10 273 TRUE TRUE FALSE FALSE FALSE FALSE
## 11 275 TRUE TRUE FALSE FALSE FALSE FALSE
## 12 277 TRUE TRUE FALSE FALSE FALSE FALSE
## 13 285 TRUE FALSE TRUE FALSE FALSE FALSE
## 14 289 TRUE FALSE TRUE FALSE FALSE FALSE
## 15 293 TRUE FALSE TRUE FALSE FALSE FALSE
## 16 308 TRUE FALSE TRUE FALSE FALSE FALSE
## 17 313 TRUE FALSE TRUE FALSE FALSE FALSE
## 18 323 TRUE TRUE FALSE FALSE FALSE FALSE
## 19 326 TRUE FALSE TRUE FALSE FALSE FALSE
## 20 329 TRUE TRUE FALSE FALSE FALSE FALSE
## 21 330 TRUE FALSE TRUE FALSE FALSE FALSE
## 22 347 TRUE TRUE FALSE FALSE FALSE FALSE
## 23 352 TRUE TRUE FALSE FALSE FALSE FALSE
## 24 355 TRUE TRUE FALSE FALSE FALSE FALSE
## 25 366 TRUE TRUE FALSE FALSE FALSE FALSE
## 26 368 TRUE TRUE FALSE FALSE FALSE FALSE
## 27 369 TRUE FALSE TRUE FALSE FALSE FALSE
## 28 372 TRUE TRUE FALSE FALSE FALSE FALSE
## 29 375 TRUE TRUE FALSE FALSE FALSE FALSE
## 30 385 TRUE FALSE TRUE FALSE FALSE FALSE
## 31 388 TRUE TRUE FALSE FALSE FALSE FALSE
## 32 403 TRUE TRUE FALSE FALSE FALSE FALSE
## 33 408 TRUE TRUE FALSE FALSE FALSE FALSE
# Clean-up
rm(pc_outlier, pc1_outlier, pc2_outlier, pc3_outlier, pc4_outlier, pc5_outlier)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC1,PC2)) +
geom_point(aes(col=group, text=Sample_num)) +
labs(title="PC1 vs PC2", x="PC1", y="PC2") +
theme(legend.title=element_blank()) + # To suppress the legend title
myColourScale + # otherwise it would be "group"
geom_vline(xintercept=c(pc1_min, pc1_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC2,PC3)) +
geom_point(aes(col=group, text=Sample_num)) +
labs(title="PC2 vs PC3", x="PC2", y="PC3") +
theme(legend.title=element_blank()) +
myColourScale +
geom_vline(xintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# geom_hline(yintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC3,PC4)) +
geom_point(aes(col=group, text=Sample_num)) +
labs(title="PC3 vs PC4", x="PC3", y="PC4") +
theme(legend.title=element_blank()) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
# geom_vline(xintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2) +
# geom_hline(yintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC4,PC5)) +
geom_point(aes(col=group, text=Sample_num)) +
labs(title="PC4 vs PC5", x="PC4", y="PC5") +
theme(legend.title=element_blank()) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
# geom_vline(xintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2) +
# geom_hline(yintercept=c(pc5_min, pc5_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g, myColourScale,
pc1_min, pc1_max, pc2_min, pc2_max, pc3_min, pc3_max, pc4_min, pc4_max, pc5_min, pc5_max)
save.image(paste(base_folder, "s01_add_PCs.RData", sep="/"))
ls()
## [1] "base_folder" "genotypes.mx" "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.0 ggplot2_3.2.0 dplyr_0.8.1 knitr_1.23
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 later_0.8.0 pillar_1.4.1 compiler_3.5.1 tools_3.5.1 digest_0.6.19 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.2 rlang_0.3.4 shiny_1.3.2 crosstalk_1.0.0 yaml_2.2.0 xfun_0.7 withr_2.1.2 stringr_1.4.0 httr_1.4.0 htmlwidgets_1.3 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 purrr_0.3.2 tidyr_0.8.3 magrittr_1.5 promises_1.0.1 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 xtable_1.8-4 mime_0.7 colorspace_1.4-1 httpuv_1.5.1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
Sys.time()
## [1] "2019-10-01 20:12:13 BST"